knitr::opts_chunk$set(echo = TRUE)

# Install package
# (after you install, you can delete this line and keep only the 'library' line)
# install.packages(c("tidyverse", "gridExtra", "kableExtra"))

# Load package to environment
library(tidyverse)
library(gridExtra)
library(kableExtra)

Import Comma-separated files

  1. Comma-separated files (.csv) are the most common data files.

  2. Before you import, make sure your data is saved inside your project folder where your .Rproj file is located.

  3. Here I use the built-in function read.csv() to load a .csv file. Since we use Rproject, we only need to type in the relative file path instead of the absolute file path.

  4. To find out your absolute path, use getwd().

  5. If you don’t use Rproject, you can also install and load the here package to find your absolute file path (google the package to learn more).

# Read the three .csv data sets from the project's data/ folder
# (the paths are relative, so no absolute path is needed)
gapminder <- read.csv(file = "data/gapminder.csv")
tidy_df1  <- read.csv(file = "data/tidy_example_1.csv")
tidy_df2  <- read.csv(file = "data/tidy_example_2.csv")

Browse data in R

  1. You can click the little table logo next to your data object in the Environment panel to view the data, or type View(data_object) in the R Console.

  2. Common things to check about your data:

  3. You can also view the first and last several rows of your data with head(data_object) and tail(data_object).
# View data in a pop-up window
View(gapminder)

# Summary statistics for each variable
summary(gapminder)

# Variable (column) names
names(gapminder)

# Number of rows and columns
nrow(gapminder)
ncol(gapminder)

# Check the first 10 observations
head(gapminder, n = 10)

# Check the last 5 observations
tail(gapminder, n = 5)

Basic tidyverse command

  1. We are going to use functions from the dplyr package under the tidyverse package collection.

  2. Let’s start with the six basic commands for manipulating a data frame: filter, arrange, slice, select, rename, and mutate.

# ---------- Filter ----------
# filter() keeps only the rows that satisfy the given condition(s).
# Filter only Asian countries:
filter(gapminder, continent == "Asia")

# Or (|): keep observations whose continent equals either Asia or Americas
filter(gapminder, continent == "Asia" | continent == "Americas")

# Or: instead of chaining |, you can use %in% followed by a vector of values
filter(gapminder, continent %in% c("Asia", "Americas"))

# And (&): keep observations that satisfy both conditions
filter(gapminder, continent == "Asia" & year == 2007)

# Negation: not equal to
filter(gapminder, continent != "Asia")

# Negation: keep rows whose value is not equal to any value in the vector
filter(gapminder, !(continent %in% c("Asia", "Americas")))

# Combine AND and negation:
filter(gapminder, continent == "Asia" & year != 2007)

# Filter only 2007 data:
filter(gapminder, year == 2007)

# Save the 2007 data as a new data object (it is reused in the Arrange examples)
gapminder_2007 <- filter(gapminder, year == 2007)
# ---------- Arrange ----------
# arrange() sorts the observations (rows) by one or more variables
# Sort by GDP per capita, from lowest to highest
arrange(gapminder_2007, gdpPercap)

# Sort by GDP per capita, from highest to lowest
# Wrap the variable in desc() to sort in descending order
arrange(gapminder_2007, desc(gdpPercap))
# ---------- Slice ----------
# slice() selects rows by their position
# Select the 2nd row of the data
slice(gapminder, 2)

# Select the 2nd to 10th rows of the data
slice(gapminder, 2:10)
# ---------- Select ----------
# Select the desired variables (columns) by name
select(gapminder, country, pop)

# You can deselect a column by putting - before its name
select(gapminder, -country, -pop)

# Select all columns between country and year (inclusive)
select(gapminder, country:year)
# ---------- Rename ----------
# Rename a column using new_name = old_name
rename(gapminder, population = pop)
# ---------- Mutate ----------
# mutate() adds new variables computed from existing ones; a variable
# created earlier in the same call (here `gdp`) can be used by later ones
mutate(
  gapminder,
  gdpPercap_in_thousand = gdpPercap / 1000,
  gdp = pop * gdpPercap,
  log_gdp = log(gdp),    # natural log
  log2_gdp = log2(gdp),  # base-2 log
  id = row_number()      # create id by row number
)

“Pipe” in tidyverse coding

  1. To make code easier to write and read, the tidyverse offers a special syntax called “piping.”

  2. The general idea is that you start from a dataframe, or an object, and use the pipe %>% command to lay out the actions you want to take to that object.

For example, if you want to first arrange the GDP per Capita in descending order, and then keep the top 5 countries, you can pipe these two actions:

# Start from the 2007 data, sort by GDP per capita from highest to lowest,
# then keep the first five rows
gapminder_2007 %>%
  arrange(desc(gdpPercap)) %>%
  slice(1:5)

You can also do the above using a single function, which can likewise be piped onto your data object:

# slice_max() keeps the n rows with the largest values of a variable.
# It supersedes top_n() and, like the arrange() + slice() version above,
# returns the rows ordered from highest to lowest.
gapminder_2007 %>% slice_max(gdpPercap, n = 5)
  1. Piping makes your code look cleaner and is more efficient, because you don’t need to save the objects created in the intermediate steps.

For example, we can skip the step to create gapminder_2007 by piping the filter step to the gapminder data:

# Same top-5 result as before, but starting from the full data:
# filter to 2007, sort by GDP per capita (descending), keep the first five rows
gapminder %>% 
  filter(year == 2007) %>%
  arrange(desc(gdpPercap)) %>%
  slice(1:5)
  1. Piping also makes plotting easier, which we will cover later.

Make untidy data tidy

  1. As we covered in the lecture, before analyzing data, make sure it’s tidy.

  2. What if it’s not tidy?

  3. Remember that what counts as tidy depends on your question: specifically, what counts as an observation in your study?
# Observe the data structure of the two untidy examples.
# If, in our study, a unit of observation is a country's stats in a particular year,
# what changes are needed in order to make them tidy?
View(tidy_df1)
View(tidy_df2)


# For df1, first move the years from the column names into a "year" variable,
# then give "cases" and "population" a column each

tidy_df1 %>%

  # Gather the two year columns into long format; names_prefix strips the
  # "year_" prefix so that the year variable holds just "1999" / "2000"
  pivot_longer(
    cols = c(year_1999, year_2000),
    names_to = "year",
    names_prefix = "year_",
    values_to = "count"
  ) %>%

  # Spread the values of "type" into separate columns, filled from "count"
  pivot_wider(names_from = type, values_from = count)


# For df2, "cases" and "population" each need their own column:
# spread the values of "type" into columns, filled from "count"
tidy_df2 %>%
  pivot_wider(names_from = type, values_from = count)



# You can save the clean data frame as a new object,
tidy_clean <- tidy_df2 %>%
  pivot_wider(names_from = type,
              values_from = count)


# and export it as .csv to your data folder.
# row.names = FALSE keeps R's row numbers out of the file; spell out FALSE
# rather than F, because F is an ordinary variable that can be reassigned.
write.csv(tidy_clean, "data/tidy_clean.csv", row.names = FALSE)


# You can control how your table is laid out in the HTML output
# using kable_styling() from the kableExtra package:
# kbl() builds the table from the data frame and kable_styling() styles it
tidy_clean %>%
  kbl() %>%
  kable_styling()

Summarise and group data

  1. The summarise() function collapses many values down to a single summary, e.g. mean, median, standard deviation, max, min, etc.

  2. The group_by() function creates a grouped copy of a table, thus you can apply various functions to each group.

  3. By combining group_by() with summarise(), you can get various descriptive statistics for your data, either for the entire dataset or by group (e.g. groups by gender, race, education level, etc.).

# summarise() on its own: one average life expectancy for all 2007 rows
gapminder %>%
  filter(year == 2007) %>%
  summarise(avg_life = mean(lifeExp))


# group_by() + summarise(): one average life expectancy per continent
gapminder %>%
  filter(year == 2007) %>%
  group_by(continent) %>%
  summarise(avg_life = mean(lifeExp))


# A single summarise() call can return many summary statistics per group
gapminder %>%
  filter(year == 2007) %>%
  group_by(continent) %>%
  summarise(
    year = 2007,                       # constant label for the year shown
    n_country = n(),                   # number of rows in the group
    max_gdpPercap = max(gdpPercap),
    min_gdpPercap = min(gdpPercap),
    mean_gdpPercap = mean(gdpPercap),
    sd_gdpPercap = sd(gdpPercap)
  )

Plot using ggplot2

  1. As covered in our lecture, making a plot is usually the very first thing we do to understand data. Plotting can reveal trends that are hard to observe by just looking at the values in a table.

  2. In general, for univariate analysis (i.e. one variable), you will look at a histogram for numeric variables and a frequency bar plot for categorical variables. For bivariate analysis (i.e. two variables), you will look at a scatterplot for numeric variables.

  3. You can plot in R using the built-in R plotting functions, but a more popular syntax is ggplot2, a package that’s already included when you load the tidyverse package.

  4. Basic syntax for ggplot2:

data %>%
  ggplot() + 
  geom_xxx(mapping = aes()) + 
  ...

Histograms

To check the distribution of one numeric variable.

# Histogram:
# For example, if we want to know the distribution of GDP per capita in the data.
# No bins or binwidth is given here, so ggplot2 uses 30 bins and prints
# the message shown below the code.
gapminder %>%
  ggplot(aes(x = gdpPercap)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# You can adjust the binwidth and other features of the histogram
# (fill is the colour of the bars, color is the colour of their outline)
gapminder %>%
  ggplot(aes(x = gdpPercap)) +
  geom_histogram(binwidth = 2000, fill = "grey", color = "black")

# Instead of the frequency (count) you can put the density on the Y axis,
# which lets you overlay a density curve on the same scale.
# after_stat(density) refers to the density computed by the histogram's
# binning statistic; it replaces the deprecated ..density.. notation.
# Notice that geom_histogram and geom_density each carry their own aesthetic mapping.
gapminder %>%
  ggplot() +
  geom_histogram(
    aes(x = gdpPercap, y = after_stat(density)),
    binwidth = 2000,
    fill = "grey",
    color = "black"
  ) +
  geom_density(
    aes(x = gdpPercap),
    color = "red"
  )

# You can apply a log transform directly inside the aesthetic mapping.
# The binwidth is now measured in log units, hence the much smaller value.
# after_stat(density) replaces the deprecated ..density.. notation.
gapminder %>%
  ggplot() +
  geom_histogram(
    aes(x = log(gdpPercap), y = after_stat(density)),
    binwidth = 0.1,
    fill = "grey",
    color = "black"
  ) +
  geom_density(
    aes(x = log(gdpPercap)),
    color = "red"
  )

Bar Plots

To check the distribution of one categorical or numeric-discrete variable.

# Bar plot:
# For example, to check the number of countries with a life expectancy of
# 60 or less in each year (the filter uses <=, so exactly 60 is included).
# geom_bar() counts rows; this assumes one row per country per year -- TODO confirm
gapminder %>%
  filter(lifeExp <= 60) %>%
  ggplot(aes(x = year)) +
  geom_bar()

# You can add a group variable using the `fill =` argument in the aesthetic mapping
gapminder %>%
  filter(lifeExp <= 60) %>%
  ggplot(aes(x = year, fill = continent)) +
  geom_bar()

# The position of the bars can be set to "fill", "dodge", or "stack" (default)
# "fill": stacked bars rescaled to the same height, showing proportions
gapminder %>%
  filter(lifeExp <= 60) %>%
  ggplot(aes(x = year, fill = continent)) +
  geom_bar(position = "fill")

# "dodge": one bar per group, placed side by side
gapminder %>%
  filter(lifeExp <= 60) %>%
  ggplot(aes(x = year, fill = continent)) +
  geom_bar(position = "dodge")

# "stack": groups stacked on top of each other (same as the default)
gapminder %>%
  filter(lifeExp <= 60) %>%
  ggplot(aes(x = year, fill = continent)) +
  geom_bar(position = "stack")

Scatter Plots

To check the joint distribution of two numeric variables.

# Scatter plot:
# Relationship between GDP per capita and life expectancy
gapminder %>%
  ggplot(aes(x = gdpPercap, y = lifeExp)) +
  geom_point()

# You can add further aesthetic mappings (here color) to show group differences;
# alpha and size are fixed values that make the points semi-transparent and smaller
gapminder %>%
  ggplot(aes(x = gdpPercap, y = lifeExp, color = continent)) +
  geom_point(alpha = 0.7, size = 1.5)

# Let's see how the scatter plot looks with log(GDP per capita) against life expectancy
gapminder %>%
  ggplot(aes(x = log(gdpPercap), y = lifeExp, color = continent)) +
  geom_point(alpha = 0.7, size = 1.5)

Box Plots

  • Either for the distribution of one numeric variable, or for a categorical variable along the x axis, and a numeric variable along the y axis

  • A box plot visualises five summary statistics: the median, two hinges and two whiskers, and all “outlying” points individually.

  • The lower and upper hinges correspond to the first and third quartiles (the 25th and 75th percentiles),

  • The upper whisker extends from the hinge to the largest value no further than \(1.5 * IQR\) from the hinge. The lower whisker extends from the hinge to the smallest value at most \(1.5 * IQR\) of the hinge. (IQR: interquartile range, the range from 25th to 75th percentile in the data.)

  • Data beyond the end of the whiskers are called “outlying” points and are plotted individually. You can set the outlier points to have a distinct color by using outlier.color = "color_name" within geom_boxplot().

  • You can customize the quartiles for your boxplot in the aesthetic mapping for geom_boxplot(). Read its documentation for guidance by typing ?geom_boxplot in the R Console.

# Box plot:

# Distribution of a single numeric variable (rows from the Americas only)
gapminder %>%
  filter(continent == "Americas") %>%
  ggplot(mapping = aes(x = gdpPercap)) +
  geom_boxplot()

# A discrete variable along the x axis and a continuous variable along the y axis;
# the outlying points are drawn as hollow pink circles
gapminder %>%
  ggplot(mapping = aes(x = continent, y = gdpPercap)) +
  geom_boxplot(outlier.color = "hotpink", outlier.shape = 1)

Line plot

# geom_line() + geom_point() are often used together to plot change over time
gapminder %>%
  filter(country == "Sweden") %>%
  ggplot(mapping = aes(x = year, y = gdpPercap)) +
  geom_point() +
  geom_line()

# Mapping "color" to a grouping variable plots one trend per group.
# For example, to compare the GDP per capita trend over the years for the BRIC countries
# (NOTE(review): Russia may have no rows in this dataset -- verify with
# unique(gapminder$country); a name without rows simply draws nothing)
gapminder %>%
  filter(country %in% c("India", "Russia", "Brazil", "China")) %>%
  ggplot(mapping = aes(x = year, y = gdpPercap, color = country)) +
  geom_point() +
  geom_line()

Fitting Model Curves using geom_smooth

  • geom_smooth can estimate the relationship between x and y based on the model you choose to fit.

  • It’s useful as an exploratory tool.

  • It includes linear and nonparametric (nonlinear) methods which can be useful if you want to compare the fit between linear and nonlinear assumptions.

  • We usually plot the smoothing on top of a scatter plot, so that we can see how well the model curve fits the data.

# For example, if we want to explore the relationship between GDP per capita
# and life expectancy

# Fit the data with a linear model (method = "lm")
gapminder %>%
  ggplot(aes(x = gdpPercap, y = lifeExp)) +
  geom_point(shape = 1, alpha = 0.5) +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

# Fit the data with a nonlinear assumption (method = "loess").
# There are various smoothing methods you can choose from. See the documentation for details.
gapminder %>%
  ggplot(aes(x = gdpPercap, y = lifeExp)) +
  geom_point(shape = 1, alpha = 0.5) +
  geom_smooth(method = "loess")
## `geom_smooth()` using formula 'y ~ x'

# Let's try the same two fits with log(gdpPercap): first the linear model
gapminder %>%
  ggplot(aes(x = log(gdpPercap), y = lifeExp)) +
  geom_point(shape = 1, alpha = 0.5) +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

# ... and then the loess smoother
gapminder %>%
  ggplot(aes(x = log(gdpPercap), y = lifeExp)) +
  geom_point(shape = 1, alpha = 0.5) +
  geom_smooth(method = "loess")
## `geom_smooth()` using formula 'y ~ x'

Faceting

  • Faceting creates subplots based on one or more discrete variables

  • Faceting is useful when you want to compare or display relationships by group in separate plots.

# For example, to compare how the relationship between lifeExp and
# gdpPercap changed between the 1950s and the 2000s on each continent:
gapminder %>%
  # The %% operator returns the remainder of a division; subtracting
  # year %% 10 from year rounds it down to the start of its decade
  mutate(decade = as.character(year - year %% 10)) %>%
  filter(decade %in% c("1950", "2000")) %>%
  ggplot(mapping = aes(x = log(gdpPercap), y = lifeExp, color = decade)) +
  geom_point(alpha = 0.5) +
  facet_wrap(vars(continent))

# You can also use facet_grid() to control how the panels are laid out.
# The two plots below show the same decade comparison as above.

# Arrange your plots as rows
gapminder %>%
  mutate(decade = as.character(year - year %% 10)) %>%
  filter(decade %in% c("1950", "2000")) %>%
  ggplot(mapping = aes(x = log(gdpPercap), y = lifeExp, color = decade)) +
  geom_point(alpha = 0.5) +
  facet_grid(rows = vars(continent))

# Arrange your plots as columns
gapminder %>%
  mutate(decade = as.character(year - year %% 10)) %>%
  filter(decade %in% c("1950", "2000")) %>%
  ggplot(mapping = aes(x = log(gdpPercap), y = lifeExp, color = decade)) +
  geom_point(alpha = 0.5) +
  facet_grid(cols = vars(continent))

Arrange plots using grid.arrange()

You can put together graphs using grid.arrange() from the gridExtra package.

# Build two versions of the same scatter plot that differ only in the
# smoothing method, and store each in an object (assigned with <-, like
# the rest of this file)
plot1 <- gapminder %>%
  ggplot(aes(x = log(gdpPercap), y = lifeExp)) +
  geom_point(shape = 1, alpha = 0.5) +
  geom_smooth(method = "lm")

plot2 <- gapminder %>%
  ggplot(aes(x = log(gdpPercap), y = lifeExp)) +
  geom_point(shape = 1, alpha = 0.5) +
  geom_smooth(method = "loess")

# Draw the two stored plots side by side (ncol = 2 puts them in two columns)
grid.arrange(plot1, plot2, ncol = 2)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

Manipulate plot layout

  1. To clearly communicate your data, please always make sure:
  • Your axes are readable, including the tick labels
  • Your plot has a title or caption
  • The size, shape, and color of your plot is easy to follow
  1. You can add title and axes labels using + labs()

  2. You can manipulate the font size, angle, and position of the axis text by using + theme(axis.text.x = ..., axis.text.y = ...)

  3. Customize your axes’ breaks using + scale_x_continuous() or + scale_x_discrete()

  4. You can also turn your colorful plot to greyscales by using + scale_colour_grey() for points, lines, etc. and + scale_fill_grey() for box plot, bar plot, violin plot, etc.

  5. You can also adjust the theme by + theme_xxx() –see ggplot2 cheatsheet

  6. There are a million things you can do to manipulate your plot. Google it or use the ggplot2 cheatsheet to discover more.

# Let's add titles and optimize the plot layout for the BRIC country GDP plot.
# NOTE(review): the title names only Brazil, China, and India; presumably
# "Russia" has no rows in this dataset -- verify with unique(gapminder$country)
gapminder %>%
  filter(country %in% c("India", "Russia", "Brazil", "China")) %>%
  ggplot(aes(x = year, y = gdpPercap, color = country)) +
  geom_point() +
  geom_line() +
  # title, axis labels, and legend title (x = NULL removes the x-axis label)
  labs(title = "GDP per capita in Brazil, China, and India (1952 to 2007)",
       x = NULL,
       y = "GDP per capita",
       color = "Country") +
  theme_minimal() +
  # greyscale palette for the lines and points
  scale_color_grey() +
  # one axis tick for every year that appears in the data
  scale_x_continuous(breaks = unique(gapminder$year)) +
  # smaller, rotated tick labels so the years do not overlap
  theme(axis.text.x = element_text(size = 8, angle = 40, vjust = 0.6))

Saving plots in R

  • Use ggsave() for quick saving

  • If ggsave doesn’t work, you can use the conventional saving method in R:

# 1. Open a file (can be png, jpeg, pdf, etc.)
#    width and height must be numbers (for png() the default unit is pixels)
png("graph/plot_name.png", width = 350, height = 350)

# 2. Create the plot
#    (geom_xxx() is a placeholder: replace this line with your own plot code)

ggplot() + geom_xxx()

# 3. Close the file
dev.off()
# ggsave(filename = , plot = ) writes a plot to a file.
# If you don't pass a plot, it automatically saves the last plot you've run.
# For example, since we just ran the above plot, we can save it:
ggsave(filename = "graph/gdp_bric.png")

# You can customize the specs of the image (size in inches, resolution in dpi):
ggsave(
  filename = "graph/gdp_bric_2.png",
  width = 6,
  height = 4,
  units = "in",
  dpi = 160
)

Exercise

  1. In the gapminder data, which are the top 5 countries in Europe in terms of their GDP per capita in 2002? Use dplyr functions to create a table for your result.

  2. Using the gapminder data, generate a table summarising the mean, median, and standard deviation of life expectancy in Europe and Africa in 2002.

  3. Replicate the plot below using the gapminder data and ggplot2.